suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

# Location of the raw NGS data on an external volume.
# NOTE(review): not referenced in the visible part of this report - confirm
# whether it is still needed.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'

# Project root. It ends with '/', so the paste0() calls below can append
# relative paths directly.
wd <- "/Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/"
# Changes the working directory of the R session to the project root.
setwd(wd)

# Output directories passed as `outdir` to ggsave_multiple_formats() (figures)
# and export_tsv() (tables) further down.
figdir <- paste0(wd, 'Figures/Shortread/Stringtie_tximport_DESeq2/')
tabledir <- paste0(wd, 'Tables/Shortread/')

# Default ggplot2 theme for every plot created after this point:
# classic theme, 7 pt base font size, legend placed below the panel.
theme_set(
  theme_classic(base_size = 7) +
    theme(legend.position = 'bottom')
)

Functions

#' Add a coarse gene category column (`genetype2`).
#'
#' Categories are assigned by the first matching rule:
#' 1. protein-coding gene on chrM        -> 'mt-mRNA'
#' 2. protein-coding gene elsewhere      -> 'mRNA'
#' 3. chrM gene whose name has 'MT-RNR'  -> 'mt-rRNA'
#' 4. chrM gene whose name has 'MT-T'    -> 'mt-tRNA'
#' 5. `gene_type` is NA                  -> 'unannotated gene'
#' 6. everything else                    -> 'other ncRNAs'
#'
#' @param df Data frame with columns `gene_type`, `seqname` and `gene_name`.
#' @return `df` with an additional character column `genetype2`.
add_genetype2 <- function(df) {
  mutate(
    df,
    genetype2 = case_when(
      # Protein-coding genes, split by mitochondrial vs. other chromosomes.
      seqname == 'chrM' & gene_type == 'protein_coding' ~ 'mt-mRNA',
      seqname != 'chrM' & gene_type == 'protein_coding' ~ 'mRNA',
      # Mitochondrial non-coding RNAs, recognised by their gene names.
      seqname == 'chrM' & grepl('MT-RNR', gene_name) ~ 'mt-rRNA',
      seqname == 'chrM' & grepl('MT-T', gene_name) ~ 'mt-tRNA',
      # Rows without a gene_type (comparisons above evaluate to NA there).
      is.na(gene_type) ~ 'unannotated gene',
      .default = 'other ncRNAs'
    )
  )
}

#' Flag differentially expressed genes for the two siRNAs (G and I).
#'
#' Adds three character columns:
#' * `isUp`: 'common' when both siRNAs have p < `p_cutoff` and log2FC > 0,
#'   otherwise 'only G' / 'only I' when a single siRNA does, else 'not'.
#' * `isDown`: same classification for log2FC < 0.
#' * `common_DEGs`: 'up' when `isUp == 'common'`, 'down' when
#'   `isDown == 'common'`, otherwise 'other'.
#'
#' The comparison uses the `*_pvalue` columns.
#' NOTE(review): the input also carries `*_padj` columns that are not used
#' here - confirm that calling DEGs on unadjusted p-values is intended.
#'
#' The classification is computed column-wise with pmax()/pmin(), the
#' element-wise counterparts of max()/min(). Like max()/min(), they return NA
#' when either input is NA, so such rows fall through to the single-siRNA
#' rules exactly as in a row-by-row evaluation.
#'
#' @param df Data frame with columns `siMETTL2A_G_pvalue`,
#'   `siMETTL2A_I_pvalue`, `siMETTL2A_G_log2FoldChange` and
#'   `siMETTL2A_I_log2FoldChange`.
#' @param p_cutoff Significance threshold; a p-value must be strictly below
#'   it. Defaults to .05.
#' @return An ungrouped tibble: `df` plus `isUp`, `isDown` and `common_DEGs`.
add_isDEG <- function(df, p_cutoff = .05) {

  df |>
    # Drop any grouping and return a plain tibble regardless of input class.
    ungroup() |>
    as_tibble() |>
    mutate(
      isUp = case_when(
        # Larger p-value below cutoff <=> both significant;
        # smaller log2FC above 0    <=> both up-regulated.
        pmax(siMETTL2A_G_pvalue, siMETTL2A_I_pvalue) < p_cutoff &
          pmin(siMETTL2A_G_log2FoldChange, siMETTL2A_I_log2FoldChange) > 0
        ~ 'common',
        siMETTL2A_G_pvalue < p_cutoff & siMETTL2A_G_log2FoldChange > 0
        ~ 'only G',
        siMETTL2A_I_pvalue < p_cutoff & siMETTL2A_I_log2FoldChange > 0
        ~ 'only I',
        .default = 'not'
      ),
      isDown = case_when(
        # Larger log2FC below 0 <=> both down-regulated.
        pmax(siMETTL2A_G_pvalue, siMETTL2A_I_pvalue) < p_cutoff &
          pmax(siMETTL2A_G_log2FoldChange, siMETTL2A_I_log2FoldChange) < 0
        ~ 'common',
        siMETTL2A_G_pvalue < p_cutoff & siMETTL2A_G_log2FoldChange < 0
        ~ 'only G',
        siMETTL2A_I_pvalue < p_cutoff & siMETTL2A_I_log2FoldChange < 0
        ~ 'only I',
        .default = 'not'
      ),
      # Summary of the genes that respond to both siRNAs in the same direction.
      common_DEGs = case_when(
        isUp   == 'common' ~ 'up',
        isDown == 'common' ~ 'down',
        .default = 'other'
      )
    )

}

#' Drop rows whose `gene_id` contains a '|' character.
#'
#' @param df Data frame with a `gene_id` column.
#' @return `df` restricted to rows where `gene_id` has no '|'.
exclude_genes_with_samename <- function(df) {
  filter(df, !grepl('[|]', gene_id))
}

#' Annotate genes with their methylation status.
#'
#' Left-joins `methylated_genes` onto `df` by `gene_id` and fills the
#' `methylation` column with '-' for rows that have no match.
#'
#' @param df Data frame with a `gene_id` column.
#' @param methylated_genes Data frame with columns `gene_id` and
#'   `methylation`, one row per gene. Defaults to the global
#'   `DRS_methylated_genes`, which is looked up when the function is called.
#' @return `df` with a `methylation` column.
add_methylation_info <- function(df, methylated_genes = DRS_methylated_genes) {

  df |>
    left_join(
      methylated_genes,
      # Explicit key: no reliance on whichever column names happen to overlap.
      by = join_by(gene_id),
      # Fail loudly if a gene is listed more than once, instead of silently
      # duplicating rows of `df`.
      relationship = 'many-to-one'
    ) |>
    replace_na(list(methylation = '-'))

}

Read data

List of methylated genes

# One row per gene_id found in the methylated-positions table, flagged '+'.
DRS_methylated_genes <- 
  paste0(wd, 'Tables/DRS_m3C_sites/methylated_positions_2024-03-29.tsv') |> 
  read_tsv() |> 
  distinct(gene_id) |> 
  mutate(methylation = '+')
## Rows: 632 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (13): transcript_id, kmer, seqname, source, feature, score, strand, fram...
## dbl  (3): position, start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the table for a quick visual check.
DRS_methylated_genes
## # A tibble: 80 × 2
##    gene_id            methylation
##    <chr>              <chr>      
##  1 ENSG00000008988.11 +          
##  2 ENSG00000086548.9  +          
##  3 ENSG00000089009.16 +          
##  4 ENSG00000240972.2  +          
##  5 ENSG00000026025.16 +          
##  6 ENSG00000111640.15 +          
##  7 ENSG00000111775.3  +          
##  8 ENSG00000112306.8  +          
##  9 ENSG00000034510.6  +          
## 10 ENSG00000116251.11 +          
## # ℹ 70 more rows
# Gene-level annotation: unique combinations of gene ID, name, type and
# chromosome taken from the annotation table.
espresso_AsPC1_geneinfo <- 
  paste0(wd, 'Tables/Espresso_AsPC1_annotation_cleaned_2024-03-29.tsv') |> 
  read_tsv() |> 
  select(gene_id, gene_name, gene_type, seqname) |> 
  distinct()
## Rows: 36717 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): seqname, source, feature, score, strand, frame, gene_id, transcrip...
## dbl  (2): start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the table for a quick visual check.
espresso_AsPC1_geneinfo
## # A tibble: 13,486 × 4
##    gene_id            gene_name       gene_type      seqname
##    <chr>              <chr>           <chr>          <chr>  
##  1 ENSG00000113851.16 CRBN            protein_coding chr3   
##  2 ENSG00000072756.18 TRNT1           protein_coding chr3   
##  3 ENSG00000170364.13 SETMAR          protein_coding chr3   
##  4 ENSG00000144455.14 SUMF1           protein_coding chr3   
##  5 ENSG00000235978.8  ENSG00000235978 lncRNA         chr3   
##  6 ENSG00000235831.8  BHLHE40-AS1     lncRNA         chr3   
##  7 ENSG00000134107.5  BHLHE40         protein_coding chr3   
##  8 ENSG00000134108.14 ARL8B           protein_coding chr3   
##  9 ENSG00000134109.11 EDEM1           protein_coding chr3   
## 10 ENSG00000189229.12 ENSG00000189229 lncRNA         chr3   
## # ℹ 13,476 more rows

DESeq2 result

# DESeq2 results table (one row per gene), read from a gzipped TSV.
shortread_stringtie_txi_DESeq2 <- 
  paste0(wd, 'Tables/Shortread/shortread_stringtie_txi_DESeq2_2024-04-16.tsv.gz') |> 
  read_tsv()
## Rows: 13043 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): gene_name, gene_id
## dbl (18): siMETTL2A_baseMean, siMETTL2A_log2FoldChange, siMETTL2A_lfcSE, siM...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the table for a quick visual check.
shortread_stringtie_txi_DESeq2
## # A tibble: 13,043 × 20
##    gene_name siMETTL2A_baseMean siMETTL2A_log2FoldChange siMETTL2A_lfcSE
##    <chr>                  <dbl>                    <dbl>           <dbl>
##  1 7SK                      0                     NA             NA     
##  2 A1CF                  4720.                    -0.917          0.0782
##  3 A4GALT                 335.                     1.89           0.567 
##  4 AAAS                  3917.                    -0.298          0.139 
##  5 AACS                  4911.                    -0.146          0.337 
##  6 AADACP1                 61.2                    0.932          0.933 
##  7 AADAT                   15.5                   -2.52           1.93  
##  8 AAGAB                 9386.                    -0.285          0.145 
##  9 AAK1                    54.6                    0.205          0.632 
## 10 AAMDC                 1677.                     0.763          0.529 
## # ℹ 13,033 more rows
## # ℹ 16 more variables: siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>,
## #   siMETTL2A_padj <dbl>, siMETTL2A_G_baseMean <dbl>,
## #   siMETTL2A_G_log2FoldChange <dbl>, siMETTL2A_G_lfcSE <dbl>,
## #   siMETTL2A_G_stat <dbl>, siMETTL2A_G_pvalue <dbl>, siMETTL2A_G_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, …

Preprocessing

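Some gene names correspond to multiple Ensembl gene IDs; for these genes, `gene_id` holds all of the IDs concatenated with `|`.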

# Show the genes whose gene_id contains '|'.
shortread_stringtie_txi_DESeq2 |> 
  filter(grepl('[|]', gene_id)) |> 
  select(gene_name, gene_id)
## # A tibble: 69 × 2
##    gene_name      gene_id                                                       
##    <chr>          <chr>                                                         
##  1 7SK            ENSG00000275933.1|ENSG00000273591.1|ENSG00000274303.1|ENSG000…
##  2 AADACP1        ENSG00000291076.1|ENSG00000240602.10                          
##  3 ABCC13         ENSG00000291052.1|ENSG00000243064.11                          
##  4 ABCC6P1        ENSG00000291057.1|ENSG00000256340.11                          
##  5 ABCC6P2        ENSG00000255277.4|ENSG00000290943.1                           
##  6 AMZ2P1         ENSG00000291140.1|ENSG00000214174.11                          
##  7 ANAPC1P2       ENSG00000285793.1|ENSG00000231259.5                           
##  8 ARMCX5-GPRASP2 ENSG00000271147.8|ENSG00000286237.1                           
##  9 CA5BP1         ENSG00000290746.1|ENSG00000186312.11                          
## 10 CASTOR3P       ENSG00000291122.1|ENSG00000239521.12                          
## # ℹ 59 more rows

Exclude such genes

# Build the annotated DESeq2 table:
#   1. drop genes whose gene_id contains '|',
#   2. add the methylation flag ('+' / '-'),
#   3. add the DEG calls (isUp, isDown, common_DEGs),
#   4. attach gene_type and seqname from the annotation,
#   5. derive the coarse gene category (genetype2).
shortread_stringtie_txi_DESeq2_DEG_methylation <- 
  shortread_stringtie_txi_DESeq2 |> 
  exclude_genes_with_samename() |>
  add_methylation_info() |> 
  add_isDEG() |> 
  left_join(
    espresso_AsPC1_geneinfo,
    # Explicit keys instead of relying on whichever column names overlap.
    by = join_by(gene_name, gene_id),
    # Each DESeq2 row may match at most one annotation row; error out rather
    # than silently duplicating genes.
    relationship = 'many-to-one'
  ) |> 
  add_genetype2()
## Joining with `by = join_by(gene_id)`
## Joining with `by = join_by(gene_name, gene_id)`
# Write the annotated table to `tabledir` as a gzip-compressed TSV.
export_tsv(
  shortread_stringtie_txi_DESeq2_DEG_methylation,
  outdir = tabledir, compression = 'gz'
)
## 
## Exported to: /Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/Tables/Shortread/shortread_stringtie_txi_DESeq2_DEG_methylation_2024-04-17.tsv.gz
## # A tibble: 12,974 × 27
##    gene_name siMETTL2A_baseMean siMETTL2A_log2FoldChange siMETTL2A_lfcSE
##    <chr>                  <dbl>                    <dbl>           <dbl>
##  1 A1CF                  4720.                   -0.917           0.0782
##  2 A4GALT                 335.                    1.89            0.567 
##  3 AAAS                  3917.                   -0.298           0.139 
##  4 AACS                  4911.                   -0.146           0.337 
##  5 AADAT                   15.5                  -2.52            1.93  
##  6 AAGAB                 9386.                   -0.285           0.145 
##  7 AAK1                    54.6                   0.205           0.632 
##  8 AAMDC                 1677.                    0.763           0.529 
##  9 AAMP                 20411.                    0.0309          0.200 
## 10 AAR2                  4967.                   -0.106           0.401 
## # ℹ 12,964 more rows
## # ℹ 23 more variables: siMETTL2A_stat <dbl>, siMETTL2A_pvalue <dbl>,
## #   siMETTL2A_padj <dbl>, siMETTL2A_G_baseMean <dbl>,
## #   siMETTL2A_G_log2FoldChange <dbl>, siMETTL2A_G_lfcSE <dbl>,
## #   siMETTL2A_G_stat <dbl>, siMETTL2A_G_pvalue <dbl>, siMETTL2A_G_padj <dbl>,
## #   siMETTL2A_I_baseMean <dbl>, siMETTL2A_I_log2FoldChange <dbl>,
## #   siMETTL2A_I_lfcSE <dbl>, siMETTL2A_I_stat <dbl>, …

Calculate percentage of DEGs

# Number of genes per methylation status.
shortread_stringtie_txi_DESeq2_DEG_methylation |> 
  count(methylation)
## # A tibble: 2 × 2
##   methylation     n
##   <chr>       <int>
## 1 +              79
## 2 -           12895
# Gene counts per (common_DEGs, genetype2), plus the percentage each DEG class
# makes up within its gene type.
shortread_stringtie_txi_DESeq2_DEG_genetype_percentage <- 
  shortread_stringtie_txi_DESeq2_DEG_methylation |> 
  count(common_DEGs, genetype2) |> 
  mutate(percentage = 100 * n / sum(n), .by = genetype2)
shortread_stringtie_txi_DESeq2_DEG_genetype_percentage
## # A tibble: 11 × 4
##    common_DEGs genetype2        n percentage
##    <chr>       <chr>        <int>      <dbl>
##  1 down        mRNA          1687      16.5 
##  2 down        other ncRNAs   118       4.29
##  3 other       mRNA          7074      69.3 
##  4 other       mt-mRNA          3      23.1 
##  5 other       mt-tRNA          3      42.9 
##  6 other       other ncRNAs  2404      87.4 
##  7 up          mRNA          1440      14.1 
##  8 up          mt-mRNA         10      76.9 
##  9 up          mt-rRNA          2     100   
## 10 up          mt-tRNA          4      57.1 
## 11 up          other ncRNAs   229       8.32
# Gene counts per (common_DEGs, methylation), plus the percentage each DEG
# class makes up within its methylation status; rows sorted by methylation.
shortread_stringtie_txi_DESeq2_DEG_methylation_percentage <- 
  shortread_stringtie_txi_DESeq2_DEG_methylation |> 
  count(common_DEGs, methylation) |> 
  mutate(percentage = 100 * n / sum(n), .by = methylation) |> 
  arrange(methylation)
shortread_stringtie_txi_DESeq2_DEG_methylation_percentage
## # A tibble: 6 × 4
##   common_DEGs methylation     n percentage
##   <chr>       <chr>       <int>      <dbl>
## 1 down        +               3       3.80
## 2 other       +              53      67.1 
## 3 up          +              23      29.1 
## 4 down        -            1802      14.0 
## 5 other       -            9431      73.1 
## 6 up          -            1662      12.9
# Gene counts per (common_DEGs, methylation, genetype2), plus the percentage
# each DEG class makes up within its methylation status x gene type
# combination; rows sorted by methylation.
shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage <- 
  shortread_stringtie_txi_DESeq2_DEG_methylation |> 
  count(common_DEGs, methylation, genetype2) |> 
  mutate(percentage = 100 * n / sum(n), .by = c(methylation, genetype2)) |> 
  arrange(methylation)
shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage
## # A tibble: 16 × 5
##    common_DEGs methylation genetype2        n percentage
##    <chr>       <chr>       <chr>        <int>      <dbl>
##  1 down        +           mRNA             3       4.41
##  2 other       +           mRNA            52      76.5 
##  3 other       +           mt-mRNA          1      11.1 
##  4 up          +           mRNA            13      19.1 
##  5 up          +           mt-mRNA          8      88.9 
##  6 up          +           mt-rRNA          2     100   
##  7 down        -           mRNA          1684      16.6 
##  8 down        -           other ncRNAs   118       4.29
##  9 other       -           mRNA          7022      69.3 
## 10 other       -           mt-mRNA          2      50   
## 11 other       -           mt-tRNA          3      42.9 
## 12 other       -           other ncRNAs  2404      87.4 
## 13 up          -           mRNA          1427      14.1 
## 14 up          -           mt-mRNA          2      50   
## 15 up          -           mt-tRNA          4      57.1 
## 16 up          -           other ncRNAs   229       8.32

Plot

# Horizontal stacked bars: share of down / other / up genes for each
# methylation status.
shortread_stringtie_txi_DESeq2_DEG_methylation_percentage_barplot <- 
  shortread_stringtie_txi_DESeq2_DEG_methylation_percentage |> 
  ggplot(aes(x = methylation, y = percentage, fill = common_DEGs)) +
  geom_col() +
  scale_y_reverse() +
  coord_flip() +
  # Named values tie each colour to its class, so the mapping cannot shift
  # when a class is absent from the data.
  scale_fill_manual(
    values = c(down = '#3e3ef2', other = 'grey', up = '#f23e3e')
  )
# Save the figure to `figdir`.
shortread_stringtie_txi_DESeq2_DEG_methylation_percentage_barplot |> 
  ggsave_multiple_formats(
    width = 3.5, height = 2.5, fontsize = 7, outdir = figdir
  )

# Horizontal bars, one per methylation status x gene type combination, filled
# to 100 % by the counts of down / other / up genes.
shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage_barplot <- 
  shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage |> 
  ggplot(aes(
    x = interaction(methylation |> fct_rev(), genetype2 |> fct_rev()), 
    y = n, fill = common_DEGs
  )) +
  # position_fill() rescales the stacked counts of each bar to proportions.
  geom_col(position = position_fill()) +
  scale_y_reverse() +
  # interaction() joins the two factors with '.'; split the axis labels on it
  # to draw a nested axis.
  scale_x_discrete(guide = ggh4x::guide_axis_nested(delim = '.')) +
  # Named values tie each colour to its class, so the mapping cannot shift
  # when a class is absent from the data.
  scale_fill_manual(
    values = c(down = '#3e3ef2', other = 'grey', up = '#f23e3e')
  ) +
  coord_flip()
# Save the figure to `figdir`.
shortread_stringtie_txi_DESeq2_DEG_methylation_genetype_percentage_barplot |> 
  ggsave_multiple_formats(
    width = 6, height = 6, fontsize = 7, outdir = figdir
  )
## Warning: The S3 guide system was deprecated in ggplot2 3.5.0.
## ℹ It has been replaced by a ggproto system that can be extended.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Horizontal stacked bars: share of down / other / up genes for each gene
# type.
shortread_stringtie_txi_DESeq2_DEG_genetype_percentage_barplot <- 
  shortread_stringtie_txi_DESeq2_DEG_genetype_percentage |> 
  ggplot(aes(
    x = genetype2 |> fct_rev(), 
    y = percentage, fill = common_DEGs
  )) +
  geom_col() +
  scale_y_reverse() +
  coord_flip() +
  # Named values tie each colour to its class, so the mapping cannot shift
  # when a class is absent from the data.
  scale_fill_manual(
    values = c(down = '#3e3ef2', other = 'grey', up = '#f23e3e')
  )
# Save the figure to `figdir`.
shortread_stringtie_txi_DESeq2_DEG_genetype_percentage_barplot |> 
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir
  )